age - age
bp - blood pressure
sg - specific gravity
al - albumin
su - sugar
rbc - red blood cells
pc - pus cell
pcc - pus cell clumps
ba - bacteria
bgr - blood glucose random
bu - blood urea
sc - serum creatinine
sod - sodium
pot - potassium
hemo - hemoglobin
pcv - packed cell volume
wc - white blood cell count
rc - red blood cell count
htn - hypertension
dm - diabetes mellitus
cad - coronary artery disease
appet - appetite
pe - pedal edema
ane - anemia
class - class ( Here ckd means patient have Chronic kidney disease and not ckd indicated absence of the same)
# necessary imports
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od
import warnings
# silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
%matplotlib inline
# show up to 26 columns (this dataset has 26) instead of pandas' default truncation
pd.set_option('display.max_columns', 26)
# Downloading the DATASET from the KAGGLE.COM project site. This automatically creates a project directory and places the dataset file there.
od.download("https://www.kaggle.com/mansoordaku/ckdisease")
# loading data from the downloaded file path...
df= pd.read_csv('./ckdisease/kidney_disease.csv')
# quick sanity check of the first rows
df.head()
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds Your Kaggle username: niranjanreddykaya Your Kaggle Key: ········
100%|█████████████████████████████████████████████████████████████████████████████████████| 9.51k/9.51k [00:00<?, ?B/s]
Downloading ckdisease.zip to .\ckdisease
| id | age | bp | sg | al | su | rbc | pc | pcc | ba | bgr | bu | sc | sod | pot | hemo | pcv | wc | rc | htn | dm | cad | appet | pe | ane | classification | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 48.0 | 80.0 | 1.020 | 1.0 | 0.0 | NaN | normal | notpresent | notpresent | 121.0 | 36.0 | 1.2 | NaN | NaN | 15.4 | 44 | 7800 | 5.2 | yes | yes | no | good | no | no | ckd |
| 1 | 1 | 7.0 | 50.0 | 1.020 | 4.0 | 0.0 | NaN | normal | notpresent | notpresent | NaN | 18.0 | 0.8 | NaN | NaN | 11.3 | 38 | 6000 | NaN | no | no | no | good | no | no | ckd |
| 2 | 2 | 62.0 | 80.0 | 1.010 | 2.0 | 3.0 | normal | normal | notpresent | notpresent | 423.0 | 53.0 | 1.8 | NaN | NaN | 9.6 | 31 | 7500 | NaN | no | yes | no | poor | no | yes | ckd |
| 3 | 3 | 48.0 | 70.0 | 1.005 | 4.0 | 0.0 | normal | abnormal | present | notpresent | 117.0 | 56.0 | 3.8 | 111.0 | 2.5 | 11.2 | 32 | 6700 | 3.9 | yes | no | no | poor | yes | yes | ckd |
| 4 | 4 | 51.0 | 80.0 | 1.010 | 2.0 | 0.0 | normal | normal | notpresent | notpresent | 106.0 | 26.0 | 1.4 | NaN | NaN | 11.6 | 35 | 7300 | 4.6 | no | no | no | good | no | no | ckd |
# dataset dimensions: (rows, columns)
df.shape
(400, 26)
# dropping id column
df.drop('id', axis = 1, inplace = True)
# rename column names to make it more user-friendly
df.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'aanemia', 'class']
# confirm the id column is gone: 25 columns remain
df.shape
(400, 25)
# preview the data with the new, descriptive column names
df.head()
| age | blood_pressure | specific_gravity | albumin | sugar | red_blood_cells | pus_cell | pus_cell_clumps | bacteria | blood_glucose_random | blood_urea | serum_creatinine | sodium | potassium | haemoglobin | packed_cell_volume | white_blood_cell_count | red_blood_cell_count | hypertension | diabetes_mellitus | coronary_artery_disease | appetite | peda_edema | aanemia | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 48.0 | 80.0 | 1.020 | 1.0 | 0.0 | NaN | normal | notpresent | notpresent | 121.0 | 36.0 | 1.2 | NaN | NaN | 15.4 | 44 | 7800 | 5.2 | yes | yes | no | good | no | no | ckd |
| 1 | 7.0 | 50.0 | 1.020 | 4.0 | 0.0 | NaN | normal | notpresent | notpresent | NaN | 18.0 | 0.8 | NaN | NaN | 11.3 | 38 | 6000 | NaN | no | no | no | good | no | no | ckd |
| 2 | 62.0 | 80.0 | 1.010 | 2.0 | 3.0 | normal | normal | notpresent | notpresent | 423.0 | 53.0 | 1.8 | NaN | NaN | 9.6 | 31 | 7500 | NaN | no | yes | no | poor | no | yes | ckd |
| 3 | 48.0 | 70.0 | 1.005 | 4.0 | 0.0 | normal | abnormal | present | notpresent | 117.0 | 56.0 | 3.8 | 111.0 | 2.5 | 11.2 | 32 | 6700 | 3.9 | yes | no | no | poor | yes | yes | ckd |
| 4 | 51.0 | 80.0 | 1.010 | 2.0 | 0.0 | normal | normal | notpresent | notpresent | 106.0 | 26.0 | 1.4 | NaN | NaN | 11.6 | 35 | 7300 | 4.6 | no | no | no | good | no | no | ckd |
# summary statistics of the numeric columns
df.describe()
| age | blood_pressure | specific_gravity | albumin | sugar | blood_glucose_random | blood_urea | serum_creatinine | sodium | potassium | haemoglobin | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 391.000000 | 388.000000 | 353.000000 | 354.000000 | 351.000000 | 356.000000 | 381.000000 | 383.000000 | 313.000000 | 312.000000 | 348.000000 |
| mean | 51.483376 | 76.469072 | 1.017408 | 1.016949 | 0.450142 | 148.036517 | 57.425722 | 3.072454 | 137.528754 | 4.627244 | 12.526437 |
| std | 17.169714 | 13.683637 | 0.005717 | 1.352679 | 1.099191 | 79.281714 | 50.503006 | 5.741126 | 10.408752 | 3.193904 | 2.912587 |
| min | 2.000000 | 50.000000 | 1.005000 | 0.000000 | 0.000000 | 22.000000 | 1.500000 | 0.400000 | 4.500000 | 2.500000 | 3.100000 |
| 25% | 42.000000 | 70.000000 | 1.010000 | 0.000000 | 0.000000 | 99.000000 | 27.000000 | 0.900000 | 135.000000 | 3.800000 | 10.300000 |
| 50% | 55.000000 | 80.000000 | 1.020000 | 0.000000 | 0.000000 | 121.000000 | 42.000000 | 1.300000 | 138.000000 | 4.400000 | 12.650000 |
| 75% | 64.500000 | 80.000000 | 1.020000 | 2.000000 | 0.000000 | 163.000000 | 66.000000 | 2.800000 | 142.000000 | 4.900000 | 15.000000 |
| max | 90.000000 | 180.000000 | 1.025000 | 5.000000 | 5.000000 | 490.000000 | 391.000000 | 76.000000 | 163.000000 | 47.000000 | 17.800000 |
# column dtypes and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 400 entries, 0 to 399 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 391 non-null float64 1 blood_pressure 388 non-null float64 2 specific_gravity 353 non-null float64 3 albumin 354 non-null float64 4 sugar 351 non-null float64 5 red_blood_cells 248 non-null object 6 pus_cell 335 non-null object 7 pus_cell_clumps 396 non-null object 8 bacteria 396 non-null object 9 blood_glucose_random 356 non-null float64 10 blood_urea 381 non-null float64 11 serum_creatinine 383 non-null float64 12 sodium 313 non-null float64 13 potassium 312 non-null float64 14 haemoglobin 348 non-null float64 15 packed_cell_volume 330 non-null object 16 white_blood_cell_count 295 non-null object 17 red_blood_cell_count 270 non-null object 18 hypertension 398 non-null object 19 diabetes_mellitus 398 non-null object 20 coronary_artery_disease 398 non-null object 21 appetite 399 non-null object 22 peda_edema 399 non-null object 23 aanemia 399 non-null object 24 class 400 non-null object dtypes: float64(11), object(14) memory usage: 78.2+ KB
As we can see, 'packed_cell_volume', 'white_blood_cell_count' and 'red_blood_cell_count' are of object type. We need to convert them to a numeric dtype.
# converting necessary columns to numerical type
# these three columns were read as object dtype; coerce stray non-numeric
# entries to NaN so they can be imputed later
for _numeric_col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[_numeric_col] = pd.to_numeric(df[_numeric_col], errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 400 entries, 0 to 399 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 391 non-null float64 1 blood_pressure 388 non-null float64 2 specific_gravity 353 non-null float64 3 albumin 354 non-null float64 4 sugar 351 non-null float64 5 red_blood_cells 248 non-null object 6 pus_cell 335 non-null object 7 pus_cell_clumps 396 non-null object 8 bacteria 396 non-null object 9 blood_glucose_random 356 non-null float64 10 blood_urea 381 non-null float64 11 serum_creatinine 383 non-null float64 12 sodium 313 non-null float64 13 potassium 312 non-null float64 14 haemoglobin 348 non-null float64 15 packed_cell_volume 329 non-null float64 16 white_blood_cell_count 294 non-null float64 17 red_blood_cell_count 269 non-null float64 18 hypertension 398 non-null object 19 diabetes_mellitus 398 non-null object 20 coronary_artery_disease 398 non-null object 21 appetite 399 non-null object 22 peda_edema 399 non-null object 23 aanemia 399 non-null object 24 class 400 non-null object dtypes: float64(14), object(11) memory usage: 78.2+ KB
# Extracting categorical and numerical columns
cat_cols = list(df.select_dtypes(include='object').columns)
num_cols = list(df.select_dtypes(exclude='object').columns)
# looking at unique values in categorical columns
for col in cat_cols:
    print(f"{col} has {df[col].unique()} values\n")
red_blood_cells has [nan 'normal' 'abnormal'] values pus_cell has ['normal' 'abnormal' nan] values pus_cell_clumps has ['notpresent' 'present' nan] values bacteria has ['notpresent' 'present' nan] values hypertension has ['yes' 'no' nan] values diabetes_mellitus has ['yes' 'no' ' yes' '\tno' '\tyes' nan] values coronary_artery_disease has ['no' 'yes' '\tno' nan] values appetite has ['good' 'poor' nan] values peda_edema has ['no' 'yes' nan] values aanemia has ['no' 'yes' nan] values class has ['ckd' 'ckd\t' 'notckd'] values
There is some ambiguity present in these columns (stray tabs and spaces in the category labels); we have to remove it.
# replace incorrect values (tab/space-polluted category labels)
# NOTE: plain assignment instead of .replace(..., inplace=True) on a column
# selection — the inplace form is chained assignment, which pandas warns on
# and which stops working under copy-on-write (pandas >= 2.x defaults)
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'})
df['coronary_artery_disease'] = df['coronary_artery_disease'].replace(to_replace = '\tno', value='no')
df['class'] = df['class'].replace(to_replace = {'ckd\t': 'ckd', 'notckd': 'not ckd'})
#--------------------------
# encode the target: ckd -> 0, not ckd -> 1
df['class'] = df['class'].map({'ckd': 0, 'not ckd': 1})
# safety net: coerce anything unexpected to NaN rather than keeping strings
df['class'] = pd.to_numeric(df['class'], errors='coerce')
#--------------------------
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']
for col in cols:
    print(f"{col} has {df[col].unique()} values\n")
diabetes_mellitus has ['yes' 'no' nan] values coronary_artery_disease has ['no' 'yes' nan] values class has [0 1] values
# checking numerical features distribution
plt.figure(figsize = (20, 15))
plotnumber = 1
for column in num_cols:
    if plotnumber <= 14:        # 3x5 grid -> at most 15 axes; we plot 14
        ax = plt.subplot(3, 5, plotnumber)
        # sns.distplot was deprecated and removed from seaborn;
        # histplot with a KDE overlay is the supported equivalent
        sns.histplot(df[column], kde=True)
        plt.xlabel(column)
    plotnumber += 1
plt.tight_layout()
plt.show()
Skewness is present in some of the columns.
# looking at categorical columns
plt.figure(figsize = (20, 15))
plotnumber = 1
for column in cat_cols:
    if plotnumber <= 11:        # 3x4 grid -> at most 12 axes; we plot 11
        ax = plt.subplot(3, 4, plotnumber)
        # pass the data by keyword: positional data for countplot is
        # deprecated in seaborn >= 0.12 and errors in newer releases
        sns.countplot(x = df[column], palette = 'rocket')
        plt.xlabel(column)
    plotnumber += 1
plt.tight_layout()
plt.show()
# heatmap of data
plt.figure(figsize = (15, 8))
# numeric_only=True: at this point df still holds object (categorical)
# columns, and pandas >= 2.0 raises a TypeError if corr() is asked to
# correlate non-numeric data (older pandas silently dropped those columns)
sns.heatmap(df.corr(numeric_only=True), annot = True, linewidths = 2, linecolor = 'lightgrey')
plt.show()
# Let's list all the available columns we are going to operate from now on.
df.columns
Index(['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
'potassium', 'haemoglobin', 'packed_cell_volume',
'white_blood_cell_count', 'red_blood_cell_count', 'hypertension',
'diabetes_mellitus', 'coronary_artery_disease', 'appetite',
'peda_edema', 'aanemia', 'class'],
dtype='object')
# defining functions to create plot
def violin(col):
    """Show an interactive per-class violin plot of *col* (module-level df)."""
    plot_kwargs = dict(y=col, x="class", color="class", box=True, template='plotly_dark')
    return px.violin(df, **plot_kwargs).show()
def kde(col):
    """Overlay class-conditional kernel density estimates for *col*."""
    facet = sns.FacetGrid(df, hue="class", height=6, aspect=2)
    facet.map(sns.kdeplot, col)
    facet.add_legend()
def scatter(col1, col2):
    """Show an interactive scatter of *col1* vs *col2*, coloured by class."""
    figure = px.scatter(df, x=col1, y=col2, color="class", template='plotly_dark')
    return figure.show()
# per-class distributions of key numeric features:
# violin() gives an interactive violin, kde() an overlaid density estimate
violin('red_blood_cell_count')
kde('red_blood_cell_count')
violin('white_blood_cell_count')
kde('white_blood_cell_count')
violin('packed_cell_volume')
kde('packed_cell_volume')
violin('haemoglobin')
kde('haemoglobin')
violin('albumin')
kde('albumin')
violin('blood_glucose_random')
kde('blood_glucose_random')
violin('sodium')
kde('sodium')
violin('blood_urea')
kde('blood_urea')
violin('specific_gravity')
kde('specific_gravity')
# pairwise relationships between related features, coloured by class
scatter('haemoglobin', 'packed_cell_volume')
scatter('red_blood_cell_count', 'packed_cell_volume')
scatter('red_blood_cell_count', 'albumin')
scatter('sugar', 'blood_glucose_random')
scatter('packed_cell_volume','blood_urea')
# grouped bar charts of feature pairs split by the class label
px.bar(df, x="specific_gravity", y="packed_cell_volume", color='class', barmode='group', template = 'plotly_dark', height = 400)
px.bar(df, x="specific_gravity", y="albumin", color='class', barmode='group', template = 'plotly_dark', height = 400)
px.bar(df, x="blood_pressure", y="packed_cell_volume", color='class', barmode='group', template = 'plotly_dark', height = 400)
px.bar(df, x="blood_pressure", y="haemoglobin", color='class', barmode='group', template = 'plotly_dark', height = 400)
# checking for null values
# count missing values per column, most-missing first
df.isna().sum().sort_values(ascending = False)
red_blood_cells 152 red_blood_cell_count 131 white_blood_cell_count 106 potassium 88 sodium 87 packed_cell_volume 71 pus_cell 65 haemoglobin 52 sugar 49 specific_gravity 47 albumin 46 blood_glucose_random 44 blood_urea 19 serum_creatinine 17 blood_pressure 12 age 9 bacteria 4 pus_cell_clumps 4 hypertension 2 diabetes_mellitus 2 coronary_artery_disease 2 appetite 1 peda_edema 1 aanemia 1 class 0 dtype: int64
# missing-value counts for the numeric features only
df[num_cols].isnull().sum()
age 9 blood_pressure 12 specific_gravity 47 albumin 46 sugar 49 blood_glucose_random 44 blood_urea 19 serum_creatinine 17 sodium 87 potassium 88 haemoglobin 52 packed_cell_volume 71 white_blood_cell_count 106 red_blood_cell_count 131 dtype: int64
# missing-value counts for the categorical features only
df[cat_cols].isnull().sum()
red_blood_cells 152 pus_cell 65 pus_cell_clumps 4 bacteria 4 hypertension 2 diabetes_mellitus 2 coronary_artery_disease 2 appetite 1 peda_edema 1 aanemia 1 class 0 dtype: int64
# filling null values, we will use two methods, random sampling for higher null values and
# mean/mode sampling for lower null values
def random_value_imputation(feature, seed=None):
    """Fill NaNs in df[feature] with values sampled from the observed ones.

    seed: optional int for reproducible sampling; the default (None) keeps
    the original unseeded behaviour.
    """
    observed = df[feature].dropna()
    n_missing = df[feature].isna().sum()
    # sample with replacement only if there are more holes than observed
    # values (otherwise .sample would raise); unchanged for this dataset
    random_sample = observed.sample(n_missing,
                                    replace=n_missing > len(observed),
                                    random_state=seed)
    random_sample.index = df[df[feature].isnull()].index
    df.loc[df[feature].isnull(), feature] = random_sample
def impute_mode(feature):
    """Fill NaNs in df[feature] with the column's most frequent value."""
    df[feature] = df[feature].fillna(df[feature].mode()[0])
# -----------------
# filling num_cols null values using random sampling method
for col in num_cols:
    random_value_imputation(col)
# verify: no numeric column should have missing values left
df[num_cols].isnull().sum()
age 0 blood_pressure 0 specific_gravity 0 albumin 0 sugar 0 blood_glucose_random 0 blood_urea 0 serum_creatinine 0 sodium 0 potassium 0 haemoglobin 0 packed_cell_volume 0 white_blood_cell_count 0 red_blood_cell_count 0 dtype: int64
# filling "red_blood_cells" and "pus_cell" using random sampling method and rest of cat_cols using mode imputation
random_value_imputation('red_blood_cells')
random_value_imputation('pus_cell')
# mode imputation is a no-op for the two columns already filled above
for col in cat_cols:
    impute_mode(col)
# verify: no categorical column should have missing values left
df[cat_cols].isnull().sum()
red_blood_cells 0 pus_cell 0 pus_cell_clumps 0 bacteria 0 hypertension 0 diabetes_mellitus 0 coronary_artery_disease 0 appetite 0 peda_edema 0 aanemia 0 class 0 dtype: int64
# confirm every categorical column is now binary (2 categories each)
for col in cat_cols:
    print(f"{col} has {df[col].nunique()} categories\n")
red_blood_cells has 2 categories pus_cell has 2 categories pus_cell_clumps has 2 categories bacteria has 2 categories hypertension has 2 categories diabetes_mellitus has 2 categories coronary_artery_disease has 2 categories appetite has 2 categories peda_edema has 2 categories aanemia has 2 categories class has 2 categories
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# encode each binary categorical column to 0/1; fit_transform refits the
# encoder on every column, so reusing one LabelEncoder instance is safe here
# NOTE(review): 'class' was already mapped to 0/1 earlier, so re-encoding it
# here should be a no-op — confirm cat_cols still lists it as expected
for col in cat_cols:
    df[col] = le.fit_transform(df[col])
df.head()
| age | blood_pressure | specific_gravity | albumin | sugar | red_blood_cells | pus_cell | pus_cell_clumps | bacteria | blood_glucose_random | blood_urea | serum_creatinine | sodium | potassium | haemoglobin | packed_cell_volume | white_blood_cell_count | red_blood_cell_count | hypertension | diabetes_mellitus | coronary_artery_disease | appetite | peda_edema | aanemia | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 48.0 | 80.0 | 1.020 | 1.0 | 0.0 | 0 | 1 | 0 | 0 | 121.0 | 36.0 | 1.2 | 142.0 | 5.0 | 15.4 | 44.0 | 7800.0 | 5.2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 7.0 | 50.0 | 1.020 | 4.0 | 0.0 | 1 | 1 | 0 | 0 | 75.0 | 18.0 | 0.8 | 145.0 | 5.7 | 11.3 | 38.0 | 6000.0 | 6.4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 62.0 | 80.0 | 1.010 | 2.0 | 3.0 | 1 | 1 | 0 | 0 | 423.0 | 53.0 | 1.8 | 131.0 | 4.9 | 9.6 | 31.0 | 7500.0 | 3.9 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 3 | 48.0 | 70.0 | 1.005 | 4.0 | 0.0 | 1 | 0 | 1 | 0 | 117.0 | 56.0 | 3.8 | 111.0 | 2.5 | 11.2 | 32.0 | 6700.0 | 3.9 | 1 | 0 | 0 | 1 | 1 | 1 | 0 |
| 4 | 51.0 | 80.0 | 1.010 | 2.0 | 0.0 | 1 | 1 | 0 | 0 | 106.0 | 26.0 | 1.4 | 139.0 | 4.9 | 11.6 | 35.0 | 7300.0 | 4.6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# independent features: every column except the target
ind_col = [col for col in df.columns if col != 'class']
dep_col = 'class'
X = df[ind_col]
y = df[dep_col]
# splitting data into training and test set (80/20, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of knn
# predict once per split rather than once per metric
knn_train_pred = knn.predict(X_train)
knn_test_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_test_pred)
print(f"Training Accuracy of KNN is {accuracy_score(y_train, knn_train_pred)}")
print(f"Test Accuracy of KNN is {knn_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, knn_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, knn_test_pred)}")
Training Accuracy of KNN is 0.7875
Test Accuracy of KNN is 0.7
Confusion Matrix :-
[[38 14]
[10 18]]
Classification Report :-
precision recall f1-score support
0 0.79 0.73 0.76 52
1 0.56 0.64 0.60 28
accuracy 0.70 80
macro avg 0.68 0.69 0.68 80
weighted avg 0.71 0.70 0.70 80
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of decision tree
# compute each split's predictions once and reuse them for every metric
dtc_train_pred = dtc.predict(X_train)
dtc_test_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(y_test, dtc_test_pred)
print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc_train_pred)}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dtc_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc_test_pred)}")
Training Accuracy of Decision Tree Classifier is 1.0
Test Accuracy of Decision Tree Classifier is 0.9625
Confusion Matrix :-
[[51 1]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 0.98 0.97 52
1 0.96 0.93 0.95 28
accuracy 0.96 80
macro avg 0.96 0.95 0.96 80
weighted avg 0.96 0.96 0.96 80
# hyper parameter tuning of decision tree
from sklearn.model_selection import GridSearchCV
grid_param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'splitter' : ['best', 'random'],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    # min_samples_split must be >= 2; the value 1 is invalid and makes
    # every candidate using it fail (scored as NaN by GridSearchCV)
    'min_samples_split' : [2, 3, 5, 7],
    # 'auto' was deprecated and then removed from DecisionTreeClassifier;
    # 'sqrt' and 'log2' are the remaining named options
    'max_features' : ['sqrt', 'log2']
}
grid_search_dtc = GridSearchCV(dtc, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dtc.fit(X_train, y_train)
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [3, 5, 7, 10],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 3, 5, 7],
'min_samples_split': [1, 2, 3, 5, 7],
'splitter': ['best', 'random']},
verbose=1)
# best parameters and best score
# best_score_ is the mean cross-validated accuracy of the best candidate
print(grid_search_dtc.best_params_)
print(grid_search_dtc.best_score_)
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 5, 'splitter': 'best'}
0.9875
# best estimator
dtc = grid_search_dtc.best_estimator_
# accuracy score, confusion matrix and classification report of decision tree
# evaluate the tuned tree, predicting once per split
tuned_train_pred = dtc.predict(X_train)
tuned_test_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(y_test, tuned_test_pred)
print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, tuned_train_pred)}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, tuned_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, tuned_test_pred)}")
Training Accuracy of Decision Tree Classifier is 0.934375
Test Accuracy of Decision Tree Classifier is 0.8875
Confusion Matrix :-
[[43 9]
[ 0 28]]
Classification Report :-
precision recall f1-score support
0 1.00 0.83 0.91 52
1 0.76 1.00 0.86 28
accuracy 0.89 80
macro avg 0.88 0.91 0.88 80
weighted avg 0.91 0.89 0.89 80
from sklearn.ensemble import RandomForestClassifier
# max_features='sqrt' replaces the removed 'auto' option; for classifiers
# 'auto' was an alias for 'sqrt', so the fitted model is unchanged while
# the code keeps working on scikit-learn >= 1.3
rd_clf = RandomForestClassifier(criterion = 'entropy', max_depth = 11, max_features = 'sqrt', min_samples_leaf = 2, min_samples_split = 3, n_estimators = 130)
rd_clf.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of random forest
rd_clf_acc = accuracy_score(y_test, rd_clf.predict(X_test))
print(f"Training Accuracy of Random Forest Classifier is {accuracy_score(y_train, rd_clf.predict(X_train))}")
print(f"Test Accuracy of Random Forest Classifier is {rd_clf_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, rd_clf.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, rd_clf.predict(X_test))}")
Training Accuracy of Random Forest Classifier is 0.996875
Test Accuracy of Random Forest Classifier is 0.975
Confusion Matrix :-
[[52 0]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
from sklearn.ensemble import AdaBoostClassifier
# boost the tuned decision tree from the grid search
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn
# 1.2 and removed in 1.4 — update this keyword when upgrading sklearn
ada = AdaBoostClassifier(base_estimator = dtc)
ada.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of ada boost
ada_acc = accuracy_score(y_test, ada.predict(X_test))
print(f"Training Accuracy of Ada Boost Classifier is {accuracy_score(y_train, ada.predict(X_train))}")
print(f"Test Accuracy of Ada Boost Classifier is {ada_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, ada.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, ada.predict(X_test))}")
Training Accuracy of Ada Boost Classifier is 1.0
Test Accuracy of Ada Boost Classifier is 0.9875
Confusion Matrix :-
[[52 0]
[ 1 27]]
Classification Report :-
precision recall f1-score support
0 0.98 1.00 0.99 52
1 1.00 0.96 0.98 28
accuracy 0.99 80
macro avg 0.99 0.98 0.99 80
weighted avg 0.99 0.99 0.99 80
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of gradient boosting classifier
# predict once per split and reuse the arrays for every metric
gb_train_pred = gb.predict(X_train)
gb_test_pred = gb.predict(X_test)
gb_acc = accuracy_score(y_test, gb_test_pred)
print(f"Training Accuracy of Gradient Boosting Classifier is {accuracy_score(y_train, gb_train_pred)}")
print(f"Test Accuracy of Gradient Boosting Classifier is {gb_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, gb_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, gb_test_pred)}")
Training Accuracy of Gradient Boosting Classifier is 1.0
Test Accuracy of Gradient Boosting Classifier is 0.975
Confusion Matrix :-
[[52 0]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
# stochastic variant: subsample < 1.0 makes each tree see a random 90% of rows
sgb = GradientBoostingClassifier(max_depth = 4, subsample = 0.90, max_features = 0.75, n_estimators = 200)
sgb.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of stochastic gradient boosting classifier
sgb_train_pred = sgb.predict(X_train)
sgb_test_pred = sgb.predict(X_test)
sgb_acc = accuracy_score(y_test, sgb_test_pred)
print(f"Training Accuracy of Stochastic Gradient Boosting is {accuracy_score(y_train, sgb_train_pred)}")
print(f"Test Accuracy of Stochastic Gradient Boosting is {sgb_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, sgb_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, sgb_test_pred)}")
Training Accuracy of Stochastic Gradient Boosting is 1.0
Test Accuracy of Stochastic Gradient Boosting is 0.975
Confusion Matrix :-
[[52 0]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
from xgboost import XGBClassifier
# eval_metric is set explicitly to silence the XGBoost >= 1.3 warning about
# the default metric for 'binary:logistic' changing from 'error' to
# 'logloss'; with no early stopping it does not affect the fitted model
xgb = XGBClassifier(objective = 'binary:logistic', eval_metric = 'logloss', learning_rate = 0.5, max_depth = 5, n_estimators = 150)
xgb.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of xgboost
xgb_acc = accuracy_score(y_test, xgb.predict(X_test))
print(f"Training Accuracy of XgBoost is {accuracy_score(y_train, xgb.predict(X_train))}")
print(f"Test Accuracy of XgBoost is {xgb_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, xgb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, xgb.predict(X_test))}")
[11:58:10] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Training Accuracy of XgBoost is 1.0
Test Accuracy of XgBoost is 0.975
Confusion Matrix :-
[[52 0]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
from catboost import CatBoostClassifier
# only 10 boosting iterations — the dataset is small (400 rows)
cat = CatBoostClassifier(iterations=10)
cat.fit(X_train, y_train)
Learning rate set to 0.432149 0: learn: 0.2396296 total: 162ms remaining: 1.46s 1: learn: 0.1351223 total: 165ms remaining: 661ms 2: learn: 0.0777668 total: 168ms remaining: 392ms 3: learn: 0.0514722 total: 171ms remaining: 256ms 4: learn: 0.0365439 total: 173ms remaining: 173ms 5: learn: 0.0296262 total: 176ms remaining: 117ms 6: learn: 0.0219103 total: 179ms remaining: 76.6ms 7: learn: 0.0165990 total: 182ms remaining: 45.4ms 8: learn: 0.0144892 total: 184ms remaining: 20.5ms 9: learn: 0.0134946 total: 187ms remaining: 0us
<catboost.core.CatBoostClassifier at 0x253d3d3f820>
# accuracy score, confusion matrix and classification report of cat boost
# predict once per split and reuse the arrays for every metric
cat_train_pred = cat.predict(X_train)
cat_test_pred = cat.predict(X_test)
cat_acc = accuracy_score(y_test, cat_test_pred)
print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat_train_pred)}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, cat_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, cat_test_pred)}")
Training Accuracy of Cat Boost Classifier is 1.0
Test Accuracy of Cat Boost Classifier is 0.975
Confusion Matrix :-
[[52 0]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of extra trees classifier
etc_train_pred = etc.predict(X_train)
etc_test_pred = etc.predict(X_test)
etc_acc = accuracy_score(y_test, etc_test_pred)
print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc_train_pred)}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, etc_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, etc_test_pred)}")
Training Accuracy of Extra Trees Classifier is 1.0
Test Accuracy of Extra Trees Classifier is 0.975
Confusion Matrix :-
[[52 0]
[ 2 26]]
Classification Report :-
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)
# accuracy score, confusion matrix and classification report of lgbm classifier
lgbm_train_pred = lgbm.predict(X_train)
lgbm_test_pred = lgbm.predict(X_test)
lgbm_acc = accuracy_score(y_test, lgbm_test_pred)
print(f"Training Accuracy of LGBM Classifier is {accuracy_score(y_train, lgbm_train_pred)}")
print(f"Test Accuracy of LGBM Classifier is {lgbm_acc} \n")
print(f"{confusion_matrix(y_test, lgbm_test_pred)}\n")
print(classification_report(y_test, lgbm_test_pred))
Training Accuracy of LGBM Classifier is 1.0
Test Accuracy of LGBM Classifier is 0.975
[[52 0]
[ 2 26]]
precision recall f1-score support
0 0.96 1.00 0.98 52
1 1.00 0.93 0.96 28
accuracy 0.97 80
macro avg 0.98 0.96 0.97 80
weighted avg 0.98 0.97 0.97 80
# collect every model's test accuracy into one comparison table
model_names = ['K-Nearest Neighbor', 'Decision Tree Classifier', 'Random Forest Classifier', 'Ada Boost Classifier',
               'Gradient Boosting Classifier', 'Stochastic Gradient Boosting', 'XgBoost', 'Cat Boost', 'Extra Trees Classifier']
model_scores = [knn_acc, dtc_acc, rd_clf_acc, ada_acc, gb_acc, sgb_acc, xgb_acc, cat_acc, etc_acc]
models = pd.DataFrame({'Model': model_names, 'Score': model_scores})
# best-performing model first
models.sort_values(by = 'Score', ascending = False)
| Model | Score | |
|---|---|---|
| 3 | Ada Boost Classifier | 0.9875 |
| 2 | Random Forest Classifier | 0.9750 |
| 4 | Gradient Boosting Classifier | 0.9750 |
| 5 | Stochastic Gradient Boosting | 0.9750 |
| 6 | XgBoost | 0.9750 |
| 7 | Cat Boost | 0.9750 |
| 8 | Extra Trees Classifier | 0.9750 |
| 1 | Decision Tree Classifier | 0.8875 |
| 0 | K-Nearest Neighbor | 0.7000 |
# horizontal bar chart comparing test accuracy across all trained models
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark',
       title = 'Models Comparison')